#!/usr/bin/perl -w
#Name    	: 2_Assign_groups.pl
#Author  	: Morgan, Matthew
#Created 	: 07/2010
#Modified	: 02/2012
#Purpose	: BLAST sequences against NCBI nr data base and return best hit information appended to original data 
#Syntax		: perl 2_Assign_groups.pl [sequence file.txt]
#Further info	: Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review)
#This code is largely borrowed from the CPAN Synopsis for the RemoteBlast module.  This can be found at http://search.cpan.org/~cjfields/BioPerl-1.6.901/Bio/Tools/Run/RemoteBlast.pm
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.

#########################################################################################################################################################	
#																			#
#CSIRO Open Source Software License Agreement (GPLv3)													#
#																			#
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.					#
#																			#
#All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3				#
# as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material.	#
#The following additional terms apply under clause 7 of that license:											#
#																			#
#EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS		#
#CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS,	#
#WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,		#
#NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE.				#
#																			#
#TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT		#
#LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED.		#
#WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS,		#
#DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS;		#
#OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE		#
#OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF		#
#SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.														#
#																			#
#APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS		#
#OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR			#
#MODIFIED "CONSUMER GUARANTEES".  IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL		#
#EXTENT PERMITTED BY THE APPLICABLE LEGISLATION.  WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF		#
#THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM:							#
#1.          THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN;					#
#2.          THE REPAIR OF THE SOFTWARE; 														#
#3.          THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN,		#
#	     OR HAVING THE SOFTWARE REPAIRED.														#
#																			#
#########################################################################################################################################################

use warnings;
use strict;
use Bio::Tools::Run::RemoteBlast;
use Bio::SeqIO;
use Time::HiRes qw (sleep);

$| = 1;
my ( %seen, %allhits, %allbits, %hold, $hitinfo, $taxdata );
my $count = 0;

#Read in sequence data from tab-delimited text file

open( TAX,    "<$ARGV[0]" );
open( OUTPUT, ">Filtered_unique_sequences_groups.txt" )
  || die("Can't open OUTPUT");
open( OUTPUT1, ">Filtered_unique_sequences_no_significant_similarity.txt" );

#Prompt for environmental filter and blast algorithm to use

print "Include metagenomic and environmental samples? (Y/N) : "; 
my $filter = <STDIN>;
chomp($filter);

print "Blast algorithm to use (0=blastn; 1=megablast) : ";
my $alg = <STDIN>;
chomp($alg);

#Set BLAST parameters

my $prog   = 'blastn';
my $db     = 'nt';
my $e_val  = '10';
my @params = (
    '-prog'       => $prog,
    '-data'       => $db,
    '-expect'     => $e_val,
    '-readmethod' => 'SearchIO'
);

my $factory = Bio::Tools::Run::RemoteBlast->new(@params);

#Parse sequence data line by line, set further BLAST parameters

while (<TAX>) {
    if (/^name/) {
	chomp;	
	print OUTPUT "$_\tname\tBitScore\tGI\tAccession\tID\trecalcID\trecalcCov\tNumBestHits\tAllBestHits\n";        
	next;
    }
    my $linenumber = $. - 1;
    print "\n$linenumber\n";
    chomp;
    my $line = $_;
    my @tmp = split( /\t/, $_ );
    open BLAST, "+>currentblast.fasta";
    print BLAST ">$tmp[0]\n$tmp[3]\n";
    print "$tmp[0]\n";
    close(BLAST);
    my $str =
      Bio::SeqIO->new( -file => 'currentblast.fasta', '-format' => 'Fasta' );
    my $filt;

    if ( $filter eq "Y" ) {
        $filt = "all[filter]";
    }
    else {
        $filt =
          "all[filter] NOT(environmental samples[filter] OR metagenomes[orgn])";
    }
    $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'}          = $filt;
    $Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'NCBI_GI'}      = 'yes';
    $Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'ALIGNMENTS'}   = 10;
    $Bio::Tools::Run::RemoteBlast::RETRIEVALHEADER{'DESCRIPTIONS'} = 10;
    if ( $alg eq "1" ) {
	$Bio::Tools::Run::RemoteBlast::HEADER{'MEGABLAST'}         = 'yes';
    }
    else {
	$Bio::Tools::Run::RemoteBlast::HEADER{'MEGABLAST'}         = 'no';
    }
    
#Submit BLAST search to NCBI

    my $r = $factory->submit_blast('currentblast.fasta');

#Retrieve BLAST output

    while ( my @rids = $factory->each_rid ) {
        foreach my $rid (@rids) {
            my $rc = $factory->retrieve_blast($rid);
            if ( !ref($rc) ) {
                if ( $rc < 0 ) {
                    $factory->remove_rid($rid);
                }
                sleep 5;
            }
            
#Parse BLAST output to retrieve required information

	    else {
                $factory->remove_rid($rid);
                my $result = $rc->next_result();
                %allhits = ();
		%hold    = ();
                my $hit_num = 0;
                while ( my $hit = $result->next_hit ) {
                    $hit_num++;
                    while ( my $hsp = $hit->next_hsp ) {
                        my $gi      = $hit->ncbi_gi;
			my $hitAcc = $hit->accession;                        
			my $hitBits = $hit->bits;
			my $percent_id = sprintf ("%.2f", $hsp->percent_identity);
			my $numID = $hsp->num_identical;
			my $queryStart = $hsp->start('query');
			my $queryEnd = $hsp->end('query');        			
			my $queryLength = $result->query_length;
			my $Hardy_percentID = sprintf ("%.2f", (100*($numID / $queryLength))); 
			my $Hardy_percent_coverage =  sprintf ("%.2f", (100*((1+($queryEnd - $queryStart))/$queryLength)));				
			if (!defined($hold{$gi}{'acc'})){ #retains values for best hsp if one hit has many hsps			
				$hold{$gi}{'acc'}   = $hitAcc;                        
				$hold{$gi}{'ID'}    = $percent_id;
				$hold{$gi}{'CHID'}  = $Hardy_percentID;
				$hold{$gi}{'CHCOV'} = $Hardy_percent_coverage;
			}
			if ( defined( $allhits{$hitBits} ) ) {
                            my @new = @{ $allhits{$hitBits} };
                            push @new, $gi;
                            $allhits{$hitBits} = \@new;
                        }
                        else {
                            my @next;
                            push @next, $gi;
                            $allhits{$hitBits} = \@next;
                        }
                    }    
                }  

#Sort hits by bit score and retrieve information for first hit with top bitscore.  Retain GIs of all best hits.

                my @bits = sort { $b <=> $a } ( keys %allhits );
                if ( scalar(@bits) == 0 ) {
                    print "\tNO SIGNIFICANT SIMILARITY\n";
                    print OUTPUT1
                      "$tmp[0]\t$tmp[1]\tNO SIGNIFICANT SIMILARITY\n";
                    next;
                }
                else {
                    my $best_hit = $bits[0];
                    my @allgis   = @{ $allhits{$best_hit} };
                    my $top      = $allgis[0];
		    my $acc      = $hold{$top}{'acc'}; 		    
		    my $id       = $hold{$top}{'ID'};                   
		    my $chid     = $hold{$top}{'CHID'};
		    my $chcov    = $hold{$top}{'CHCOV'};
		    my $naccs    = @allgis;
		    $line        =~ s/\r//;
                    
		    print OUTPUT "$line\t$tmp[0]\t$best_hit\t$top\t$acc\t$id\t$chid\t$chcov\t$naccs\t@allgis\n";
                } 
            } 
        } 
    } 
    my $time = time - $^T;
    print "Time taken: $time\n";

}  
close(OUTPUT);
close(OUTPUT1);
